# Importing the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy
import seaborn as sns
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from statsmodels.tsa.seasonal import seasonal_decompose
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
# Loading the datasets and making the date the index of the dataframe
df = pd.read_csv("raw_toronto_weather_data.csv", index_col="date")
df = df.sort_values(by='date')
df.info()
df.head()
<class 'pandas.core.frame.DataFrame'> Index: 10021 entries, 1996-12-30 to 2024-06-06 Data columns (total 71 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 max_temperature 10021 non-null float64 1 avg_hourly_temperature 10021 non-null float64 2 avg_temperature 10021 non-null float64 3 min_temperature 10021 non-null float64 4 max_humidex 2823 non-null float64 5 min_windchill 3308 non-null float64 6 max_relative_humidity 10021 non-null int64 7 avg_hourly_relative_humidity 10021 non-null float64 8 avg_relative_humidity 10021 non-null float64 9 min_relative_humidity 10021 non-null int64 10 max_dew_point 10021 non-null float64 11 avg_hourly_dew_point 10021 non-null float64 12 avg_dew_point 10021 non-null float64 13 min_dew_point 10021 non-null float64 14 max_wind_speed 10021 non-null int64 15 avg_hourly_wind_speed 10021 non-null float64 16 avg_wind_speed 10021 non-null float64 17 min_wind_speed 10021 non-null int64 18 max_wind_gust 7084 non-null float64 19 wind_gust_dir_10s 7084 non-null float64 20 max_pressure_sea 10021 non-null float64 21 avg_hourly_pressure_sea 10021 non-null float64 22 avg_pressure_sea 10021 non-null float64 23 min_pressure_sea 10021 non-null float64 24 max_pressure_station 10021 non-null float64 25 avg_hourly_pressure_station 10021 non-null float64 26 avg_pressure_station 10021 non-null float64 27 min_pressure_station 10021 non-null float64 28 max_visibility 10021 non-null int64 29 avg_hourly_visibility 10021 non-null float64 30 avg_visibility 10021 non-null int64 31 min_visibility 10021 non-null int64 32 max_health_index 240 non-null float64 33 avg_hourly_health_index 240 non-null float64 34 avg_health_index 240 non-null float64 35 min_health_index 240 non-null float64 36 heatdegdays 10021 non-null float64 37 cooldegdays 10021 non-null float64 38 growdegdays_5 10021 non-null float64 39 growdegdays_7 10021 non-null float64 40 growdegdays_10 10021 non-null float64 41 precipitation 9995 non-null float64 42 rain 9984 non-null 
float64 43 snow 9994 non-null float64 44 snow_on_ground 7020 non-null float64 45 sunrise_hhmm 3810 non-null object 46 sunrise_unixtime 3810 non-null float64 47 sunrise_f 3810 non-null float64 48 sunset_hhmm 3810 non-null object 49 sunset_unixtime 3810 non-null float64 50 sunset_f 3810 non-null float64 51 daylight 3810 non-null float64 52 min_uv_forecast 3148 non-null float64 53 max_uv_forecast 3148 non-null float64 54 min_high_temperature_forecast 3810 non-null float64 55 max_high_temperature_forecast 3810 non-null float64 56 min_low_temperature_forecast 3810 non-null float64 57 max_low_temperature_forecast 3810 non-null float64 58 solar_radiation 0 non-null float64 59 max_cloud_cover_4 0 non-null float64 60 avg_hourly_cloud_cover_4 0 non-null float64 61 avg_cloud_cover_4 0 non-null float64 62 min_cloud_cover_4 0 non-null float64 63 max_cloud_cover_8 3894 non-null float64 64 avg_hourly_cloud_cover_8 3894 non-null float64 65 avg_cloud_cover_8 3894 non-null float64 66 min_cloud_cover_8 3894 non-null float64 67 max_cloud_cover_10 237 non-null float64 68 avg_hourly_cloud_cover_10 237 non-null float64 69 avg_cloud_cover_10 237 non-null float64 70 min_cloud_cover_10 237 non-null float64 dtypes: float64(62), int64(7), object(2) memory usage: 5.5+ MB
| max_temperature | avg_hourly_temperature | avg_temperature | min_temperature | max_humidex | min_windchill | max_relative_humidity | avg_hourly_relative_humidity | avg_relative_humidity | min_relative_humidity | ... | avg_cloud_cover_4 | min_cloud_cover_4 | max_cloud_cover_8 | avg_hourly_cloud_cover_8 | avg_cloud_cover_8 | min_cloud_cover_8 | max_cloud_cover_10 | avg_hourly_cloud_cover_10 | avg_cloud_cover_10 | min_cloud_cover_10 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| date | |||||||||||||||||||||
| 1996-12-30 | -2.0 | -5.63 | -4.84 | -7.7 | NaN | -15.0 | 86 | 75.9 | 71.0 | 56 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1996-12-31 | -7.3 | -11.12 | -11.00 | -14.7 | NaN | -23.0 | 90 | 77.2 | 77.5 | 65 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1997-01-01 | -3.0 | -7.96 | -8.80 | -14.6 | NaN | -22.0 | 98 | 91.4 | 91.0 | 84 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1997-01-02 | 4.1 | 0.95 | 0.44 | -3.2 | NaN | -4.0 | 100 | 97.8 | 96.5 | 93 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1997-01-03 | 5.6 | 2.95 | 3.40 | 1.2 | NaN | NaN | 100 | 90.8 | 91.0 | 82 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 71 columns
# Removing the first two rows of year 1996
df = df.drop(df.index[:2]) # Removes the first two rows
df.shape
(10019, 71)
# Finding missing rows
df[df.isnull().any(axis=1)]
| max_temperature | avg_hourly_temperature | avg_temperature | min_temperature | max_humidex | min_windchill | max_relative_humidity | avg_hourly_relative_humidity | avg_relative_humidity | min_relative_humidity | ... | avg_cloud_cover_4 | min_cloud_cover_4 | max_cloud_cover_8 | avg_hourly_cloud_cover_8 | avg_cloud_cover_8 | min_cloud_cover_8 | max_cloud_cover_10 | avg_hourly_cloud_cover_10 | avg_cloud_cover_10 | min_cloud_cover_10 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| date | |||||||||||||||||||||
| 1997-01-01 | -3.0 | -7.96 | -8.80 | -14.6 | NaN | -22.0 | 98 | 91.4 | 91.0 | 84 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1997-01-02 | 4.1 | 0.95 | 0.44 | -3.2 | NaN | -4.0 | 100 | 97.8 | 96.5 | 93 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1997-01-03 | 5.6 | 2.95 | 3.40 | 1.2 | NaN | NaN | 100 | 90.8 | 91.0 | 82 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1997-01-04 | 3.4 | 2.35 | 2.20 | 1.0 | NaN | NaN | 100 | 91.6 | 91.0 | 82 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1997-01-05 | 10.1 | 3.48 | 4.20 | -1.7 | NaN | -9.0 | 100 | 86.5 | 83.0 | 66 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2024-06-02 | 18.2 | 16.21 | 16.30 | 14.4 | NaN | NaN | 100 | 82.8 | 74.5 | 49 | ... | NaN | NaN | 8.0 | 7.8 | 6.5 | 5.0 | NaN | NaN | NaN | NaN |
| 2024-06-03 | 23.4 | 18.42 | 19.54 | 15.7 | 29.0 | NaN | 100 | 90.8 | 85.5 | 71 | ... | NaN | NaN | 8.0 | 4.5 | 4.0 | 0.0 | NaN | NaN | NaN | NaN |
| 2024-06-04 | 26.8 | 21.17 | 21.05 | 15.3 | 33.0 | NaN | 99 | 75.8 | 79.5 | 60 | ... | NaN | NaN | 8.0 | 4.8 | 4.0 | 0.0 | NaN | NaN | NaN | NaN |
| 2024-06-05 | 25.6 | 21.68 | 21.70 | 17.8 | 33.0 | NaN | 100 | 77.3 | 84.0 | 68 | ... | NaN | NaN | 8.0 | 5.8 | 4.0 | 0.0 | NaN | NaN | NaN | NaN |
| 2024-06-06 | 26.7 | 21.31 | 21.10 | 15.5 | 28.0 | NaN | 100 | 68.8 | 68.5 | 37 | ... | NaN | NaN | 8.0 | 5.4 | 4.5 | 1.0 | NaN | NaN | NaN | NaN |
10019 rows × 71 columns
# Checking for the number of missing values before data cleaning
df.isna().sum()
max_temperature 0
avg_hourly_temperature 0
avg_temperature 0
min_temperature 0
max_humidex 7196
...
min_cloud_cover_8 6125
max_cloud_cover_10 9782
avg_hourly_cloud_cover_10 9782
avg_cloud_cover_10 9782
min_cloud_cover_10 9782
Length: 71, dtype: int64
# Calculating the percentage of null values
null_pct = round(df.isna().sum()*100 / len(df), 2)
null_pct
max_temperature 0.00
avg_hourly_temperature 0.00
avg_temperature 0.00
min_temperature 0.00
max_humidex 71.82
...
min_cloud_cover_8 61.13
max_cloud_cover_10 97.63
avg_hourly_cloud_cover_10 97.63
avg_cloud_cover_10 97.63
min_cloud_cover_10 97.63
Length: 71, dtype: float64
# Getting columns with less than 0.40% missing values (null_pct is already a percentage on a 0-100 scale, so the .40 cutoff means 0.40%, not 40%)
valid_cols = df.columns[null_pct < .40]
valid_cols
Index(['max_temperature', 'avg_hourly_temperature', 'avg_temperature',
'min_temperature', 'max_relative_humidity',
'avg_hourly_relative_humidity', 'avg_relative_humidity',
'min_relative_humidity', 'max_dew_point', 'avg_hourly_dew_point',
'avg_dew_point', 'min_dew_point', 'max_wind_speed',
'avg_hourly_wind_speed', 'avg_wind_speed', 'min_wind_speed',
'max_pressure_sea', 'avg_hourly_pressure_sea', 'avg_pressure_sea',
'min_pressure_sea', 'max_pressure_station',
'avg_hourly_pressure_station', 'avg_pressure_station',
'min_pressure_station', 'max_visibility', 'avg_hourly_visibility',
'avg_visibility', 'min_visibility', 'heatdegdays', 'cooldegdays',
'growdegdays_5', 'growdegdays_7', 'growdegdays_10', 'precipitation',
'rain', 'snow'],
dtype='object')
# Assigning the valid columns only to df
df = df[valid_cols].copy()
# Checking for the number of missing values before cleaning
df.isna().sum()
max_temperature 0 avg_hourly_temperature 0 avg_temperature 0 min_temperature 0 max_relative_humidity 0 avg_hourly_relative_humidity 0 avg_relative_humidity 0 min_relative_humidity 0 max_dew_point 0 avg_hourly_dew_point 0 avg_dew_point 0 min_dew_point 0 max_wind_speed 0 avg_hourly_wind_speed 0 avg_wind_speed 0 min_wind_speed 0 max_pressure_sea 0 avg_hourly_pressure_sea 0 avg_pressure_sea 0 min_pressure_sea 0 max_pressure_station 0 avg_hourly_pressure_station 0 avg_pressure_station 0 min_pressure_station 0 max_visibility 0 avg_hourly_visibility 0 avg_visibility 0 min_visibility 0 heatdegdays 0 cooldegdays 0 growdegdays_5 0 growdegdays_7 0 growdegdays_10 0 precipitation 26 rain 37 snow 27 dtype: int64
# Missing Data heatmap before cleaning
sns.heatmap(df.isna(), yticklabels=False, cbar=False, cmap='rocket')
plt.show()
# Filling in the missing values for precipitation, rain and snow
df = df.ffill()
# Checking for the number of missing values post-cleaning
df.isna().sum()
max_temperature 0 avg_hourly_temperature 0 avg_temperature 0 min_temperature 0 max_relative_humidity 0 avg_hourly_relative_humidity 0 avg_relative_humidity 0 min_relative_humidity 0 max_dew_point 0 avg_hourly_dew_point 0 avg_dew_point 0 min_dew_point 0 max_wind_speed 0 avg_hourly_wind_speed 0 avg_wind_speed 0 min_wind_speed 0 max_pressure_sea 0 avg_hourly_pressure_sea 0 avg_pressure_sea 0 min_pressure_sea 0 max_pressure_station 0 avg_hourly_pressure_station 0 avg_pressure_station 0 min_pressure_station 0 max_visibility 0 avg_hourly_visibility 0 avg_visibility 0 min_visibility 0 heatdegdays 0 cooldegdays 0 growdegdays_5 0 growdegdays_7 0 growdegdays_10 0 precipitation 0 rain 0 snow 0 dtype: int64
# Getting a Statistical Summary of the data
df.describe(include="all")
| max_temperature | avg_hourly_temperature | avg_temperature | min_temperature | max_relative_humidity | avg_hourly_relative_humidity | avg_relative_humidity | min_relative_humidity | max_dew_point | avg_hourly_dew_point | ... | avg_visibility | min_visibility | heatdegdays | cooldegdays | growdegdays_5 | growdegdays_7 | growdegdays_10 | precipitation | rain | snow | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 10019.000000 | 10019.000000 | 10019.000000 | 10019.000000 | 10019.000000 | 10019.000000 | 10019.000000 | 10019.000000 | 10019.000000 | 10019.000000 | ... | 10019.000000 | 10019.000000 | 10019.000000 | 10019.000000 | 10019.000000 | 10019.000000 | 10019.000000 | 10019.000000 | 10019.000000 | 10019.000000 |
| mean | 13.630752 | 9.143915 | 9.010631 | 4.392035 | 85.401637 | 69.332628 | 69.030342 | 52.659048 | 6.523206 | 3.346003 | ... | 19789.799381 | 14947.509732 | 10.027927 | 1.038677 | 6.749805 | 5.573540 | 4.027478 | 2.160196 | 1.859158 | 0.320830 |
| std | 11.315122 | 10.480384 | 10.545357 | 10.042189 | 10.221874 | 12.286544 | 11.172906 | 14.556967 | 9.682561 | 9.960977 | ... | 5173.366838 | 8906.128122 | 9.247519 | 2.201631 | 7.208566 | 6.469344 | 5.293412 | 5.388539 | 5.191014 | 1.532879 |
| min | -19.100000 | -21.950000 | -22.300000 | -26.300000 | 38.000000 | 27.700000 | 29.500000 | 13.000000 | -24.800000 | -28.100000 | ... | 2000.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 4.000000 | 0.930000 | 0.840000 | -2.600000 | 79.000000 | 61.300000 | 61.500000 | 42.000000 | -0.900000 | -4.200000 | ... | 15250.000000 | 6400.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 50% | 14.000000 | 9.150000 | 9.050000 | 4.300000 | 87.000000 | 69.700000 | 69.500000 | 52.000000 | 6.800000 | 3.300000 | ... | 20700.000000 | 16100.000000 | 8.900000 | 0.000000 | 4.000000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 75% | 23.600000 | 18.520000 | 18.390000 | 13.100000 | 93.000000 | 77.700000 | 77.000000 | 62.500000 | 14.800000 | 11.800000 | ... | 24100.000000 | 24100.000000 | 17.200000 | 0.400000 | 13.400000 | 11.400000 | 8.400000 | 1.400000 | 0.600000 | 0.000000 |
| max | 37.900000 | 31.700000 | 31.950000 | 26.300000 | 100.000000 | 100.000000 | 100.000000 | 100.000000 | 26.200000 | 24.400000 | ... | 52300.000000 | 25000.000000 | 40.300000 | 13.900000 | 26.900000 | 24.900000 | 21.900000 | 126.000000 | 126.000000 | 30.400000 |
8 rows × 36 columns
# Checking the data types of the attributes
df.dtypes
max_temperature float64 avg_hourly_temperature float64 avg_temperature float64 min_temperature float64 max_relative_humidity int64 avg_hourly_relative_humidity float64 avg_relative_humidity float64 min_relative_humidity int64 max_dew_point float64 avg_hourly_dew_point float64 avg_dew_point float64 min_dew_point float64 max_wind_speed int64 avg_hourly_wind_speed float64 avg_wind_speed float64 min_wind_speed int64 max_pressure_sea float64 avg_hourly_pressure_sea float64 avg_pressure_sea float64 min_pressure_sea float64 max_pressure_station float64 avg_hourly_pressure_station float64 avg_pressure_station float64 min_pressure_station float64 max_visibility int64 avg_hourly_visibility float64 avg_visibility int64 min_visibility int64 heatdegdays float64 cooldegdays float64 growdegdays_5 float64 growdegdays_7 float64 growdegdays_10 float64 precipitation float64 rain float64 snow float64 dtype: object
# Checking the datatype of the index (i.e. the dates)
df.index
Index(['1997-01-01', '1997-01-02', '1997-01-03', '1997-01-04', '1997-01-05',
'1997-01-06', '1997-01-07', '1997-01-08', '1997-01-09', '1997-01-10',
...
'2024-05-28', '2024-05-29', '2024-05-30', '2024-05-31', '2024-06-01',
'2024-06-02', '2024-06-03', '2024-06-04', '2024-06-05', '2024-06-06'],
dtype='object', name='date', length=10019)
# Converting the index datatype from object (strings) to datetime
df.index = pd.to_datetime(df.index)
df.index
DatetimeIndex(['1997-01-01', '1997-01-02', '1997-01-03', '1997-01-04',
'1997-01-05', '1997-01-06', '1997-01-07', '1997-01-08',
'1997-01-09', '1997-01-10',
...
'2024-05-28', '2024-05-29', '2024-05-30', '2024-05-31',
'2024-06-01', '2024-06-02', '2024-06-03', '2024-06-04',
'2024-06-05', '2024-06-06'],
dtype='datetime64[ns]', name='date', length=10019, freq=None)
# Checking the data according to the years
df.index.year.value_counts().sort_index()
date 1997 365 1998 365 1999 365 2000 366 2001 365 2002 365 2003 365 2004 366 2005 365 2006 365 2007 365 2008 366 2009 365 2010 365 2011 365 2012 366 2013 365 2014 365 2015 365 2016 366 2017 365 2018 365 2019 365 2020 366 2021 365 2022 365 2023 365 2024 158 Name: count, dtype: int64
# Correlation heatmap
plt.figure(figsize=(30, 30))
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()
# Histogram for temperature
plt.figure(figsize=(10, 6))
sns.histplot(df['avg_temperature'], bins=30, kde=True)
plt.title('Distribution of Average Temperature')
plt.xlabel('Temperature (°C)')
plt.ylabel('Frequency')
plt.show()
# Histogram for rain
plt.figure(figsize=(10, 6))
sns.histplot(df['rain'], bins=30, kde=True)
plt.title('Distribution of Rainfall')
plt.xlabel('Rain')
plt.ylabel('Frequency')
plt.show()
# Histogram for precipitation
plt.figure(figsize=(10, 6))
sns.histplot(df['precipitation'], bins=30, kde=True)
plt.title('Distribution of Precipitation')
plt.xlabel('Precipitation')
plt.ylabel('Frequency')
plt.show()
# Histogram (30 bins, with a KDE overlay) of the avg_relative_humidity column
plt.figure(figsize=(10, 6))
sns.histplot(df['avg_relative_humidity'], bins=30, kde=True)
plt.title('Distribution of Relative Humidity')
plt.xlabel('Relative Humidity')
plt.ylabel('Frequency')
plt.show()
# Box plot for temperature by month
df['month'] = df.index.month
plt.figure(figsize=(14, 7))
sns.boxplot(x='month', y='avg_temperature', data=df)
plt.title('Monthly Average Temperature Distribution')
plt.xlabel('Month')
plt.ylabel('Temperature (°C)')
plt.show()
# Scatter plot for temperature vs. humidity
plt.figure(figsize=(10, 6))
sns.scatterplot(x='avg_temperature', y='avg_relative_humidity', data=df)
plt.title('Temperature vs. Humidity')
plt.xlabel('Average Temperature (°C)')
plt.ylabel('Average Relative Humidity (%)')
plt.show()
df['max_temperature'].plot()
<Axes: xlabel='date'>
df['snow'].plot()
<Axes: xlabel='date'>
df['precipitation'].plot()
<Axes: xlabel='date'>
df['rain'].plot()
<Axes: xlabel='date'>
df['cooldegdays'].plot()
<Axes: xlabel='date'>
df['max_wind_speed'].plot()
<Axes: xlabel='date'>
# Missing Data heatmap post-cleaning
sns.heatmap(df.isnull(), yticklabels=False, cbar=False, cmap='rocket')
plt.show()
# Correlation plot
plt.figure(figsize=(30,30))
sns.heatmap(df.select_dtypes(include=['number']).corr(),annot=True,cmap='RdBu')
plt.title("Correlation",fontsize=40)
plt.show()
# Scatter Plot for max_temperature vs rain
sns.scatterplot(x='rain', y='max_temperature', data=df)
plt.title('max_temperature vs. rain')
plt.show()
# Pearson Correlation Coefficient
correlation, _ = pearsonr(df['rain'], df['max_temperature'])
print(f'Pearson correlation: {correlation}')
Pearson correlation: 0.10130268238898475
# Scatter plot of max_temperature (y-axis) against avg_relative_humidity (x-axis)
sns.scatterplot(x='avg_relative_humidity', y='max_temperature', data=df)
plt.title('max_temperature vs. avg_relative_humidity')
plt.show()
# Pearson Correlation Coefficient
correlation, _ = pearsonr(df['max_temperature'], df['avg_relative_humidity'])
print(f'Pearson correlation: {correlation}')
Pearson correlation: -0.18860673181518933
# Scatter Plot for max_temperature vs avg_dew_point
sns.scatterplot(x='max_temperature', y='avg_dew_point', data=df)
plt.title('max_temperature vs. avg_dew_point')
plt.show()
# Pearson Correlation Coefficient
correlation, _ = pearsonr(df['max_temperature'], df['avg_dew_point'])
print(f'Pearson correlation: {correlation}')
Pearson correlation: 0.9333212084360358
# Scatter Plot for rain vs avg_pressure_sea
sns.scatterplot(x='avg_pressure_sea', y='rain', data=df)
plt.title('rain vs. avg_pressure_sea')
plt.show()
# Pearson Correlation Coefficient
correlation, _ = pearsonr(df['avg_pressure_sea'], df['rain'])
print(f'Pearson correlation: {correlation}')
Pearson correlation: -0.27634540616698666
# Scatter Plot for max_temperature vs cooldegdays
sns.scatterplot(x='max_temperature', y='cooldegdays', data=df)
plt.title('max_temperature vs. cooldegdays')
plt.show()
# Pearson Correlation Coefficient
correlation, _ = pearsonr(df['max_temperature'], df['cooldegdays'])
print(f'Pearson correlation: {correlation}')
Pearson correlation: 0.6514285642421251
# Setting the next day's max_temperature value as the target of the current day's features.
# Only the one column is shifted (instead of the whole frame); the final row has
# no following day, so its target is NaN.
df['y'] = df['max_temperature'].shift(-1)
df
| max_temperature | avg_hourly_temperature | avg_temperature | min_temperature | max_relative_humidity | avg_hourly_relative_humidity | avg_relative_humidity | min_relative_humidity | max_dew_point | avg_hourly_dew_point | ... | heatdegdays | cooldegdays | growdegdays_5 | growdegdays_7 | growdegdays_10 | precipitation | rain | snow | month | y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| date | |||||||||||||||||||||
| 1997-01-01 | -3.0 | -7.96 | -8.80 | -14.6 | 98 | 91.4 | 91.0 | 84 | -3.6 | -9.1 | ... | 26.8 | 0.0 | 0.0 | 0.0 | 0.0 | 4.0 | 0.0 | 4.0 | 1 | 4.1 |
| 1997-01-02 | 4.1 | 0.95 | 0.44 | -3.2 | 100 | 97.8 | 96.5 | 93 | 3.1 | 0.6 | ... | 17.6 | 0.0 | 0.0 | 0.0 | 0.0 | 1.4 | 1.4 | 0.0 | 1 | 5.6 |
| 1997-01-03 | 5.6 | 2.95 | 3.40 | 1.2 | 100 | 90.8 | 91.0 | 82 | 5.2 | 1.6 | ... | 14.6 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1 | 3.4 |
| 1997-01-04 | 3.4 | 2.35 | 2.20 | 1.0 | 100 | 91.6 | 91.0 | 82 | 3.3 | 1.1 | ... | 15.8 | 0.0 | 0.0 | 0.0 | 0.0 | 4.2 | 4.2 | 0.0 | 1 | 10.1 |
| 1997-01-05 | 10.1 | 3.48 | 4.20 | -1.7 | 100 | 86.5 | 83.0 | 66 | 8.3 | 1.3 | ... | 13.8 | 0.0 | 0.0 | 0.0 | 0.0 | 3.0 | 3.0 | 0.0 | 1 | -1.6 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2024-06-02 | 18.2 | 16.21 | 16.30 | 14.4 | 100 | 82.8 | 74.5 | 49 | 17.0 | 12.9 | ... | 1.7 | 0.0 | 11.3 | 9.3 | 6.3 | 7.0 | 7.0 | 0.0 | 6 | 23.4 |
| 2024-06-03 | 23.4 | 18.42 | 19.54 | 15.7 | 100 | 90.8 | 85.5 | 71 | 20.3 | 16.8 | ... | 0.0 | 1.5 | 14.5 | 12.5 | 9.5 | 0.0 | 0.0 | 0.0 | 6 | 26.8 |
| 2024-06-04 | 26.8 | 21.17 | 21.05 | 15.3 | 99 | 75.8 | 79.5 | 60 | 19.1 | 16.5 | ... | 0.0 | 3.0 | 16.1 | 14.1 | 11.1 | 0.0 | 0.0 | 0.0 | 6 | 25.6 |
| 2024-06-05 | 25.6 | 21.68 | 21.70 | 17.8 | 100 | 77.3 | 84.0 | 68 | 20.8 | 17.5 | ... | 0.0 | 3.7 | 16.7 | 14.7 | 11.7 | 3.2 | 3.2 | 0.0 | 6 | 26.7 |
| 2024-06-06 | 26.7 | 21.31 | 21.10 | 15.5 | 100 | 68.8 | 68.5 | 37 | 19.9 | 14.4 | ... | 0.0 | 3.1 | 16.1 | 14.1 | 11.1 | 0.8 | 0.8 | 0.0 | 6 | NaN |
10019 rows × 38 columns
# Handling the last row missing the target data: drop it.
# Forward-filling would copy the previous row's target into this row, so the
# model would be trained/scored against a value that was never observed.
df = df.dropna(subset=['y'])
df
| max_temperature | avg_hourly_temperature | avg_temperature | min_temperature | max_relative_humidity | avg_hourly_relative_humidity | avg_relative_humidity | min_relative_humidity | max_dew_point | avg_hourly_dew_point | ... | heatdegdays | cooldegdays | growdegdays_5 | growdegdays_7 | growdegdays_10 | precipitation | rain | snow | month | y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| date | |||||||||||||||||||||
| 1997-01-01 | -3.0 | -7.96 | -8.80 | -14.6 | 98 | 91.4 | 91.0 | 84 | -3.6 | -9.1 | ... | 26.8 | 0.0 | 0.0 | 0.0 | 0.0 | 4.0 | 0.0 | 4.0 | 1 | 4.1 |
| 1997-01-02 | 4.1 | 0.95 | 0.44 | -3.2 | 100 | 97.8 | 96.5 | 93 | 3.1 | 0.6 | ... | 17.6 | 0.0 | 0.0 | 0.0 | 0.0 | 1.4 | 1.4 | 0.0 | 1 | 5.6 |
| 1997-01-03 | 5.6 | 2.95 | 3.40 | 1.2 | 100 | 90.8 | 91.0 | 82 | 5.2 | 1.6 | ... | 14.6 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1 | 3.4 |
| 1997-01-04 | 3.4 | 2.35 | 2.20 | 1.0 | 100 | 91.6 | 91.0 | 82 | 3.3 | 1.1 | ... | 15.8 | 0.0 | 0.0 | 0.0 | 0.0 | 4.2 | 4.2 | 0.0 | 1 | 10.1 |
| 1997-01-05 | 10.1 | 3.48 | 4.20 | -1.7 | 100 | 86.5 | 83.0 | 66 | 8.3 | 1.3 | ... | 13.8 | 0.0 | 0.0 | 0.0 | 0.0 | 3.0 | 3.0 | 0.0 | 1 | -1.6 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2024-06-02 | 18.2 | 16.21 | 16.30 | 14.4 | 100 | 82.8 | 74.5 | 49 | 17.0 | 12.9 | ... | 1.7 | 0.0 | 11.3 | 9.3 | 6.3 | 7.0 | 7.0 | 0.0 | 6 | 23.4 |
| 2024-06-03 | 23.4 | 18.42 | 19.54 | 15.7 | 100 | 90.8 | 85.5 | 71 | 20.3 | 16.8 | ... | 0.0 | 1.5 | 14.5 | 12.5 | 9.5 | 0.0 | 0.0 | 0.0 | 6 | 26.8 |
| 2024-06-04 | 26.8 | 21.17 | 21.05 | 15.3 | 99 | 75.8 | 79.5 | 60 | 19.1 | 16.5 | ... | 0.0 | 3.0 | 16.1 | 14.1 | 11.1 | 0.0 | 0.0 | 0.0 | 6 | 25.6 |
| 2024-06-05 | 25.6 | 21.68 | 21.70 | 17.8 | 100 | 77.3 | 84.0 | 68 | 20.8 | 17.5 | ... | 0.0 | 3.7 | 16.7 | 14.7 | 11.7 | 3.2 | 3.2 | 0.0 | 6 | 26.7 |
| 2024-06-06 | 26.7 | 21.31 | 21.10 | 15.5 | 100 | 68.8 | 68.5 | 37 | 19.9 | 14.4 | ... | 0.0 | 3.1 | 16.1 | 14.1 | 11.1 | 0.8 | 0.8 | 0.0 | 6 | 26.7 |
10019 rows × 38 columns
# Checking the correlation between the target and the features
df.corr()
| max_temperature | avg_hourly_temperature | avg_temperature | min_temperature | max_relative_humidity | avg_hourly_relative_humidity | avg_relative_humidity | min_relative_humidity | max_dew_point | avg_hourly_dew_point | ... | heatdegdays | cooldegdays | growdegdays_5 | growdegdays_7 | growdegdays_10 | precipitation | rain | snow | month | y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| max_temperature | 1.000000 | 0.989004 | 0.989004 | 0.950527 | 0.027333 | -0.192890 | -0.188607 | -0.308716 | 0.933503 | 0.932883 | ... | -0.972667 | 0.651429 | 0.924878 | 0.901518 | 0.859081 | 0.027299 | 0.101303 | -0.268868 | 0.270073 | 0.927880 |
| avg_hourly_temperature | 0.989004 | 1.000000 | 0.997628 | 0.981032 | 0.063368 | -0.136254 | -0.128796 | -0.242207 | 0.949732 | 0.959909 | ... | -0.982135 | 0.652993 | 0.926195 | 0.903849 | 0.862624 | 0.051095 | 0.124014 | -0.262688 | 0.296015 | 0.930555 |
| avg_temperature | 0.989004 | 0.997628 | 1.000000 | 0.986016 | 0.071547 | -0.125816 | -0.120829 | -0.235720 | 0.950102 | 0.960796 | ... | -0.983774 | 0.657466 | 0.927268 | 0.904992 | 0.864283 | 0.057854 | 0.131066 | -0.263320 | 0.295486 | 0.930185 |
| min_temperature | 0.950527 | 0.981032 | 0.986016 | 1.000000 | 0.119496 | -0.046906 | -0.041258 | -0.147244 | 0.943749 | 0.966910 | ... | -0.970362 | 0.646838 | 0.905461 | 0.884988 | 0.847292 | 0.090772 | 0.161167 | -0.250135 | 0.316347 | 0.908248 |
| max_relative_humidity | 0.027333 | 0.063368 | 0.071547 | 0.119496 | 1.000000 | 0.837028 | 0.857901 | 0.614731 | 0.307957 | 0.295843 | ... | -0.085951 | -0.018506 | 0.012932 | 0.009350 | 0.003303 | 0.362622 | 0.338861 | 0.128348 | 0.184884 | -0.024880 |
| avg_hourly_relative_humidity | -0.192890 | -0.136254 | -0.125816 | -0.046906 | 0.837028 | 1.000000 | 0.971945 | 0.904234 | 0.121739 | 0.143392 | ... | 0.115042 | -0.119546 | -0.163160 | -0.157998 | -0.150033 | 0.407994 | 0.371475 | 0.178007 | 0.151327 | -0.214626 |
| avg_relative_humidity | -0.188607 | -0.128796 | -0.120829 | -0.041258 | 0.857901 | 0.971945 | 1.000000 | 0.932643 | 0.127504 | 0.143053 | ... | 0.109676 | -0.118115 | -0.158784 | -0.153300 | -0.145361 | 0.381645 | 0.345622 | 0.172243 | 0.142302 | -0.210255 |
| min_relative_humidity | -0.308716 | -0.242207 | -0.235720 | -0.147244 | 0.614731 | 0.904234 | 0.932643 | 1.000000 | -0.020520 | 0.011855 | ... | 0.228715 | -0.168318 | -0.252824 | -0.241889 | -0.225457 | 0.331215 | 0.292603 | 0.174277 | 0.088616 | -0.305284 |
| max_dew_point | 0.933503 | 0.949732 | 0.950102 | 0.943749 | 0.307957 | 0.121739 | 0.127504 | -0.020520 | 1.000000 | 0.983736 | ... | -0.938001 | 0.610683 | 0.874464 | 0.852964 | 0.812578 | 0.168162 | 0.231609 | -0.214667 | 0.323504 | 0.853536 |
| avg_hourly_dew_point | 0.932883 | 0.959909 | 0.960796 | 0.966910 | 0.295843 | 0.143392 | 0.143053 | 0.011855 | 0.983736 | 1.000000 | ... | -0.947515 | 0.621960 | 0.882097 | 0.861660 | 0.823253 | 0.157889 | 0.222395 | -0.219858 | 0.340616 | 0.869052 |
| avg_dew_point | 0.933321 | 0.959152 | 0.961425 | 0.967738 | 0.292715 | 0.134383 | 0.138772 | 0.007479 | 0.985909 | 0.997046 | ... | -0.948272 | 0.621816 | 0.882576 | 0.862011 | 0.823427 | 0.152258 | 0.217584 | -0.223922 | 0.342908 | 0.871293 |
| min_dew_point | 0.909369 | 0.943523 | 0.947528 | 0.965537 | 0.271016 | 0.142836 | 0.145795 | 0.033497 | 0.947615 | 0.984101 | ... | -0.933718 | 0.616420 | 0.867674 | 0.848510 | 0.812607 | 0.133528 | 0.198947 | -0.226887 | 0.352276 | 0.865701 |
| max_wind_speed | -0.170897 | -0.189302 | -0.187364 | -0.200973 | -0.007199 | -0.036928 | -0.027482 | -0.037131 | -0.138182 | -0.195519 | ... | 0.196811 | -0.070648 | -0.173766 | -0.168656 | -0.156746 | 0.109196 | 0.089280 | 0.079964 | -0.107901 | -0.296579 |
| avg_hourly_wind_speed | -0.271283 | -0.275519 | -0.274141 | -0.270128 | -0.071220 | -0.075754 | -0.055942 | -0.035865 | -0.243463 | -0.291787 | ... | 0.281126 | -0.132145 | -0.251353 | -0.243120 | -0.226597 | 0.042786 | 0.013014 | 0.106016 | -0.113260 | -0.362065 |
| avg_wind_speed | -0.234867 | -0.242977 | -0.240522 | -0.240544 | -0.039014 | -0.044383 | -0.032139 | -0.021939 | -0.202055 | -0.250661 | ... | 0.249283 | -0.104861 | -0.221058 | -0.213714 | -0.198479 | 0.079611 | 0.054756 | 0.094843 | -0.104460 | -0.341463 |
| min_wind_speed | -0.286538 | -0.270834 | -0.268147 | -0.240347 | -0.090977 | -0.044662 | -0.030822 | 0.016571 | -0.264605 | -0.278925 | ... | 0.272548 | -0.139504 | -0.243246 | -0.233808 | -0.216911 | -0.007837 | -0.034412 | 0.093283 | -0.062148 | -0.318070 |
| max_pressure_sea | -0.352729 | -0.382000 | -0.387217 | -0.415880 | -0.291259 | -0.239766 | -0.246548 | -0.173945 | -0.436057 | -0.446047 | ... | 0.383054 | -0.245844 | -0.324440 | -0.315708 | -0.302972 | -0.205508 | -0.221732 | 0.032997 | -0.039681 | -0.241204 |
| avg_hourly_pressure_sea | -0.224253 | -0.248058 | -0.251930 | -0.276491 | -0.352745 | -0.314724 | -0.321792 | -0.246272 | -0.347237 | -0.332467 | ... | 0.251730 | -0.149402 | -0.185669 | -0.179202 | -0.171062 | -0.282842 | -0.277438 | -0.049833 | 0.001910 | -0.088188 |
| avg_pressure_sea | -0.216687 | -0.240394 | -0.245286 | -0.271059 | -0.348941 | -0.309051 | -0.319080 | -0.244782 | -0.340101 | -0.323652 | ... | 0.244250 | -0.149032 | -0.181043 | -0.175183 | -0.168063 | -0.281398 | -0.276345 | -0.048853 | 0.000942 | -0.087902 |
| min_pressure_sea | -0.071091 | -0.087258 | -0.091344 | -0.111775 | -0.363896 | -0.338677 | -0.350543 | -0.282577 | -0.217536 | -0.178997 | ... | 0.093224 | -0.046017 | -0.032589 | -0.029949 | -0.028656 | -0.319910 | -0.296216 | -0.117482 | 0.037430 | 0.059657 |
| max_pressure_station | -0.272381 | -0.300447 | -0.305794 | -0.335392 | -0.293845 | -0.258518 | -0.265030 | -0.200500 | -0.361824 | -0.369887 | ... | 0.302440 | -0.194458 | -0.249536 | -0.242748 | -0.233554 | -0.205386 | -0.215306 | 0.009625 | -0.011517 | -0.161050 |
| avg_hourly_pressure_station | -0.140921 | -0.164038 | -0.168181 | -0.194479 | -0.353146 | -0.331888 | -0.338497 | -0.271634 | -0.269363 | -0.253383 | ... | 0.168965 | -0.095910 | -0.108151 | -0.103667 | -0.099183 | -0.283084 | -0.271096 | -0.074043 | 0.028890 | -0.007683 |
| avg_pressure_station | -0.130336 | -0.153374 | -0.158430 | -0.185926 | -0.349135 | -0.326247 | -0.335916 | -0.270489 | -0.259186 | -0.241601 | ... | 0.158468 | -0.093336 | -0.100598 | -0.096787 | -0.093449 | -0.280606 | -0.268740 | -0.073835 | 0.029056 | -0.004565 |
| min_pressure_station | 0.008058 | -0.008145 | -0.012425 | -0.035199 | -0.361371 | -0.351776 | -0.363189 | -0.303763 | -0.141988 | -0.103687 | ... | 0.015444 | 0.005274 | 0.040781 | 0.041591 | 0.039458 | -0.317510 | -0.287747 | -0.139426 | 0.061603 | 0.133479 |
| max_visibility | 0.030300 | 0.013857 | 0.011294 | -0.010416 | -0.117846 | -0.202143 | -0.201769 | -0.226976 | -0.031649 | -0.039616 | ... | -0.018676 | -0.024187 | 0.010421 | 0.005593 | -0.001860 | -0.099173 | -0.084126 | -0.064567 | -0.001997 | 0.030510 |
| avg_hourly_visibility | 0.173586 | 0.139956 | 0.133159 | 0.084092 | -0.470062 | -0.628284 | -0.606394 | -0.600774 | -0.016716 | -0.022359 | ... | -0.141414 | 0.044135 | 0.136559 | 0.127395 | 0.112277 | -0.412921 | -0.327512 | -0.346730 | 0.053943 | 0.202211 |
| avg_visibility | 0.217006 | 0.186833 | 0.179377 | 0.132254 | -0.544138 | -0.635329 | -0.631286 | -0.586968 | 0.003309 | 0.017349 | ... | -0.191061 | 0.056899 | 0.161484 | 0.149919 | 0.132229 | -0.406746 | -0.340652 | -0.286284 | 0.066972 | 0.260355 |
| min_visibility | 0.238493 | 0.210828 | 0.203317 | 0.158327 | -0.579204 | -0.647268 | -0.642739 | -0.579926 | 0.018065 | 0.037956 | ... | -0.213574 | 0.076970 | 0.182922 | 0.171656 | 0.154454 | -0.427977 | -0.357954 | -0.303580 | 0.078702 | 0.288759 |
| heatdegdays | -0.972667 | -0.982135 | -0.983774 | -0.970362 | -0.085951 | 0.115042 | 0.109676 | 0.228715 | -0.938001 | -0.947515 | ... | 1.000000 | -0.511641 | -0.871134 | -0.838756 | -0.780267 | -0.061782 | -0.138612 | 0.276821 | -0.311740 | -0.917773 |
| cooldegdays | 0.651429 | 0.652993 | 0.657466 | 0.646838 | -0.018506 | -0.119546 | -0.118115 | -0.168318 | 0.610683 | 0.621960 | ... | -0.511641 | 1.000000 | 0.782245 | 0.811555 | 0.862263 | 0.017254 | 0.045273 | -0.098752 | 0.106086 | 0.600270 |
| growdegdays_5 | 0.924878 | 0.926195 | 0.927268 | 0.905461 | 0.012932 | -0.163160 | -0.158784 | -0.252824 | 0.874464 | 0.882097 | ... | -0.871134 | 0.782245 | 1.000000 | 0.995900 | 0.974675 | 0.030329 | 0.085744 | -0.195889 | 0.206545 | 0.876579 |
| growdegdays_7 | 0.901518 | 0.903849 | 0.904992 | 0.884988 | 0.009350 | -0.157998 | -0.153300 | -0.241889 | 0.852964 | 0.861660 | ... | -0.838756 | 0.811555 | 0.995900 | 1.000000 | 0.989167 | 0.026295 | 0.077245 | -0.180298 | 0.192357 | 0.854507 |
| growdegdays_10 | 0.859081 | 0.862624 | 0.864283 | 0.847292 | 0.003303 | -0.150033 | -0.145361 | -0.225457 | 0.812578 | 0.823253 | ... | -0.780267 | 0.862263 | 0.974675 | 0.989167 | 1.000000 | 0.021470 | 0.066414 | -0.159261 | 0.171671 | 0.813289 |
| precipitation | 0.027299 | 0.051095 | 0.057854 | 0.090772 | 0.362622 | 0.407994 | 0.381645 | 0.331215 | 0.168162 | 0.157889 | ... | -0.061782 | 0.017254 | 0.030329 | 0.026295 | 0.021470 | 1.000000 | 0.959758 | 0.241716 | 0.003906 | -0.007350 |
| rain | 0.101303 | 0.124014 | 0.131066 | 0.161167 | 0.338861 | 0.371475 | 0.345622 | 0.292603 | 0.231609 | 0.222395 | ... | -0.138612 | 0.045273 | 0.085744 | 0.077245 | 0.066414 | 0.959758 | 1.000000 | -0.021995 | 0.034456 | 0.067095 |
| snow | -0.268868 | -0.262688 | -0.263320 | -0.250135 | 0.128348 | 0.178007 | 0.172243 | 0.174277 | -0.214667 | -0.219858 | ... | 0.276821 | -0.098752 | -0.195889 | -0.180298 | -0.159261 | 0.241716 | -0.021995 | 1.000000 | -0.104762 | -0.273316 |
| month | 0.270073 | 0.296015 | 0.295486 | 0.316347 | 0.184884 | 0.151327 | 0.142302 | 0.088616 | 0.323504 | 0.340616 | ... | -0.311740 | 0.106086 | 0.206545 | 0.192357 | 0.171671 | 0.003906 | 0.034456 | -0.104762 | 1.000000 | 0.258859 |
| y | 0.927880 | 0.930555 | 0.930185 | 0.908248 | -0.024880 | -0.214626 | -0.210255 | -0.305284 | 0.853536 | 0.869052 | ... | -0.917773 | 0.600270 | 0.876579 | 0.854507 | 0.813289 | -0.007350 | 0.067095 | -0.273316 | 0.258859 | 1.000000 |
38 rows × 38 columns
# Initializing the Ridge Regression Model
ridge_reg = Ridge(alpha = .1)
# Creating a list of predictor columns
X = df.columns[~df.columns.isin(['y'])]
# Time-series Cross Validation
def backtest(df, model, X, start=3650, step=90, target='y'):
    """Evaluate ``model`` with an expanding-window, walk-forward backtest.

    Beginning at row position ``start``, the model is refit on every row
    that precedes the current position and then predicts the next ``step``
    rows. The position advances by ``step`` until the end of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows in the order they should be walked; each training slice is the
        rows positioned before the corresponding test slice.
    model : object
        Estimator exposing ``fit(features, target)`` and ``predict(features)``.
        It is refit in place on every fold.
    X : list-like of column labels
        Predictor columns selected from ``df``.
    start : int, default 3650
        Number of leading rows used for the first fit.
    step : int, default 90
        Number of rows predicted per fold (the last fold may be shorter).
    target : str, default 'y'
        Column of ``df`` holding the value to predict.

    Returns
    -------
    pandas.DataFrame
        One row per predicted row of ``df`` (same index labels), with columns
        ``actual``, ``prediction`` and ``diff`` (absolute error).

    Raises
    ------
    ValueError
        If ``step`` is not positive, or ``start`` leaves no rows to predict.
    """
    n_rows = df.shape[0]
    if step <= 0:
        raise ValueError(f"step must be a positive integer, got {step!r}")
    if start >= n_rows:
        # Without this guard the loop never runs and pd.concat([]) fails with
        # an unhelpful "No objects to concatenate".
        raise ValueError(
            f"start={start!r} leaves no rows to predict in a frame of {n_rows} rows"
        )

    all_predictions = []
    for i in range(start, n_rows, step):
        train = df.iloc[:i]
        test = df.iloc[i:i + step]
        model.fit(train[X], train[target])
        # Re-attach the test index so predictions align with the actual values.
        preds = pd.Series(model.predict(test[X]), index=test.index)
        combined = pd.concat([test[target], preds], axis=1)
        combined.columns = ['actual', 'prediction']
        combined['diff'] = (combined['prediction'] - combined['actual']).abs()
        all_predictions.append(combined)
    return pd.concat(all_predictions)
predictions = backtest(df, ridge_reg, X)
predictions
| actual | prediction | diff | |
|---|---|---|---|
| date | |||
| 2006-12-30 | 5.2 | 4.512077 | 0.687923 |
| 2006-12-31 | 9.2 | 5.348854 | 3.851146 |
| 2007-01-01 | 5.8 | 6.055589 | 0.255589 |
| 2007-01-02 | 8.0 | 8.376891 | 0.376891 |
| 2007-01-03 | 11.9 | 7.741514 | 4.158486 |
| ... | ... | ... | ... |
| 2024-06-02 | 23.4 | 21.161758 | 2.238242 |
| 2024-06-03 | 26.8 | 24.138726 | 2.661274 |
| 2024-06-04 | 25.6 | 26.653006 | 1.053006 |
| 2024-06-05 | 26.7 | 25.595051 | 1.104949 |
| 2024-06-06 | 26.7 | 22.887354 | 3.812646 |
6369 rows × 3 columns
# Generating the MSE
mean_squared_error(predictions['actual'], predictions['prediction'])
12.789728514684702
# Generating the MAE
predictions['diff'].mean()
# mean_absolute_error(predictions['actual'], predictions['prediction'])
2.7690746620485
# Generating R2
r2_score(predictions['actual'], predictions['prediction'])
0.9010629886108817
# Checking the errors
predictions['diff'].round().value_counts().sort_index()
diff 0.0 827 1.0 1444 2.0 1242 3.0 937 4.0 659 5.0 471 6.0 305 7.0 211 8.0 124 9.0 71 10.0 34 11.0 24 12.0 11 13.0 3 14.0 4 16.0 2 Name: count, dtype: int64